# Package setup ----
# Install only the packages that are not already available. Calling
# install.packages() unconditionally re-downloads everything on each run and
# fails without network access even when the packages are already installed.
required_packages <- c("tidyverse", "lubridate", "dplyr", "ggplot2", "ggthemes")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}

library(tidyverse)
library(lubridate)
library(dplyr)
library(ggplot2)
library(ggthemes)
# Load and combine the monthly trip files ----
# The twelve monthly extracts (202010 .. 202109) are read by full path from
# `data_dir`, so the script does not change the session's working directory.
data_dir <- "/Users/tushy/OneDrive/Desktop/Capstone Google/Capstone"
trip_months <- c(
  "202010", "202011", "202012", "202101", "202102", "202103",
  "202104", "202105", "202106", "202107", "202108", "202109"
)

# Read one monthly CSV ("<month>-divvy-tripdata.csv") from `dir`.
# Both station id columns are converted to character for every month:
# bind_rows() needs matching column types across files, and as.character()
# leaves a column that is already character unchanged.
read_trip_month <- function(month, dir = data_dir) {
  path <- file.path(dir, paste0(month, "-divvy-tripdata.csv"))
  if (!file.exists(path)) {
    stop("Trip data file not found: ", path, call. = FALSE)
  }
  mutate(
    read.csv(path),
    start_station_id = as.character(start_station_id),
    end_station_id = as.character(end_station_id)
  )
}

monthly_trips <- lapply(trip_months, read_trip_month)
names(monthly_trips) <- paste0(
  "biketripdata_",
  substr(trip_months, 1, 4),
  "_",
  substr(trip_months, 5, 6)
)

# Keep the per-month data frames available under their usual names
# (biketripdata_2020_10, ..., biketripdata_2021_09) for any code that
# refers to them directly.
list2env(monthly_trips, envir = environment())

# Stack all months into a single data frame.
tot_biketrips <- bind_rows(monthly_trips)
# List the column names of the combined trip data
colnames(tot_biketrips)
[1] "ride_id" "rideable_type" "started_at" "ended_at"
[5] "start_station_name" "start_station_id" "end_station_name" "end_station_id"
[9] "start_lat" "start_lng" "end_lat" "end_lng"
[13] "member_casual"
# Count the rows (one per ride record) in the combined trip data
nrow(tot_biketrips)
[1] 5136261
# Per-column summary of the combined trip data (types, ranges, NA counts)
summary(tot_biketrips)
ride_id rideable_type started_at ended_at start_station_name
Length:5136261 Length:5136261 Length:5136261 Length:5136261 Length:5136261
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
start_station_id end_station_name end_station_id start_lat start_lng
Length:5136261 Length:5136261 Length:5136261 Min. :41.64 Min. :-87.84
Class :character Class :character Class :character 1st Qu.:41.88 1st Qu.:-87.66
Mode :character Mode :character Mode :character Median :41.90 Median :-87.64
Mean :41.90 Mean :-87.65
3rd Qu.:41.93 3rd Qu.:-87.63
Max. :42.08 Max. :-87.52
end_lat end_lng member_casual
Min. :41.51 Min. :-88.07 Length:5136261
1st Qu.:41.88 1st Qu.:-87.66 Class :character
Median :41.90 Median :-87.64 Mode :character
Mean :41.90 Mean :-87.65
3rd Qu.:41.93 3rd Qu.:-87.63
Max. :42.17 Max. :-87.44
NA's :4821 NA's :4821
# Column names of the combined trip data (same call as earlier in the script)
colnames(tot_biketrips)
[1] "ride_id" "rideable_type" "started_at" "ended_at"
[5] "start_station_name" "start_station_id" "end_station_name" "end_station_id"
[9] "start_lat" "start_lng" "end_lat" "end_lng"
[13] "member_casual"
# Row count of the combined trip data (same call as earlier in the script)
nrow(tot_biketrips)
[1] 5136261
# Per-column summary of the combined trip data (same call as earlier in the script)
summary(tot_biketrips)
ride_id rideable_type started_at ended_at start_station_name
Length:5136261 Length:5136261 Length:5136261 Length:5136261 Length:5136261
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
start_station_id end_station_name end_station_id start_lat start_lng
Length:5136261 Length:5136261 Length:5136261 Min. :41.64 Min. :-87.84
Class :character Class :character Class :character 1st Qu.:41.88 1st Qu.:-87.66
Mode :character Mode :character Mode :character Median :41.90 Median :-87.64
Mean :41.90 Mean :-87.65
3rd Qu.:41.93 3rd Qu.:-87.63
Max. :42.08 Max. :-87.52
end_lat end_lng member_casual
Min. :41.51 Min. :-88.07 Length:5136261
1st Qu.:41.88 1st Qu.:-87.66 Class :character
Median :41.90 Median :-87.64 Mode :character
Mean :41.90 Mean :-87.65
3rd Qu.:41.93 3rd Qu.:-87.63
Max. :42.17 Max. :-87.44
NA's :4821 NA's :4821
# Preview the first rows of the combined trip data
head(tot_biketrips)
ride_id <chr> | rideable_type <chr> | started_at <chr> | ended_at <chr> | ||
---|---|---|---|---|---|
1 | ACB6B40CF5B9044C | electric_bike | 2020-10-31 19:39:43 | 2020-10-31 19:57:12 | |
2 | DF450C72FD109C01 | electric_bike | 2020-10-31 23:50:08 | 2020-11-01 00:04:16 | |
3 | B6396B54A15AC0DF | electric_bike | 2020-10-31 23:00:01 | 2020-10-31 23:08:22 | |
4 | 44A4AEE261B9E854 | electric_bike | 2020-10-31 22:16:43 | 2020-10-31 22:19:35 | |
5 | 10B7DD76A6A2EB95 | electric_bike | 2020-10-31 19:38:19 | 2020-10-31 19:54:32 | |
6 | DA6C3759660133DA | electric_bike | 2020-10-29 17:38:04 | 2020-10-29 17:45:43 |
# Derive calendar fields from the ride start timestamp: the start date, plus
# its month, day of month, year and weekday name, each formatted as text.
tot_biketrips_clean <- mutate(
  tot_biketrips_clean,
  date = as.Date(started_at),
  month = format(date, "%m"),
  day = format(date, "%d"),
  year = format(date, "%Y"),
  day_of_week = format(date, "%A")
)

# Check the structure of the data frame with the new columns
glimpse(tot_biketrips_clean)
Rows: 5,045,921
Columns: 18
$ ride_id <chr> "ACB6B40CF5B9044C", "DF450C72FD109C01", "B6396B54A15AC0DF", "44A4AEE261~
$ rideable_type <chr> "electric_bike", "electric_bike", "electric_bike", "electric_bike", "el~
$ started_at <chr> "2020-10-31 19:39:43", "2020-10-31 23:50:08", "2020-10-31 23:00:01", "2~
$ ended_at <chr> "2020-10-31 19:57:12", "2020-11-01 00:04:16", "2020-10-31 23:08:22", "2~
$ start_station_name <chr> "Lakeview Ave & Fullerton Pkwy", "Southport Ave & Waveland Ave", "Stony~
$ start_station_id <chr> "313", "227", "102", "165", "190", "359", "313", "125", "174", "114", "~
$ end_station_name <chr> "Rush St & Hubbard St", "Kedzie Ave & Milwaukee Ave", "University Ave &~
$ end_station_id <chr> "125", "260", "423", "256", "185", "53", "125", "313", "635", "303", "1~
$ start_lat <dbl> 41.92610, 41.94817, 41.77346, 41.95085, 41.92886, 41.90353, 41.92584, 4~
$ start_lng <dbl> -87.63898, -87.66391, -87.58537, -87.65924, -87.66396, -87.64335, -87.6~
$ end_lat <dbl> 41.89035, 41.92953, 41.79145, 41.95281, 41.91778, 41.89440, 41.89047, 4~
$ end_lng <dbl> -87.62607, -87.70782, -87.60005, -87.65010, -87.69143, -87.63431, -87.6~
$ member_casual <chr> "casual", "casual", "casual", "casual", "casual", "casual", "casual", "~
$ date <date> 2020-10-31, 2020-10-31, 2020-10-31, 2020-10-31, 2020-10-31, 2020-10-29~
$ month <chr> "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10", "10",~
$ day <chr> "31", "31", "31", "31", "31", "29", "29", "29", "29", "28", "29", "29",~
$ year <chr> "2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020", "2020",~
$ day_of_week <chr> "Saturday", "Saturday", "Saturday", "Saturday", "Saturday", "Thursday",~
# Ride duration ----
# Stored as a plain number of seconds. The units are fixed explicitly because
# difftime() otherwise picks secs/mins/hours/days from the data, and the value
# is converted to numeric so min(), mean(), aggregate() etc. return numbers.
# NOTE(review): the timestamps are parsed in the session's time zone (the
# difftime() default) -- confirm this matches how the data was recorded.
tot_biketrips_clean$ride_length <- as.numeric(
  difftime(
    tot_biketrips_clean$ended_at,
    tot_biketrips_clean$started_at,
    units = "secs"
  )
)

# Remove bad data ----
# Drop rides that start at the "HQ QR" station and rides with a negative
# duration. A missing station name or duration counts as "do not drop":
# indexing rows with an NA condition would replace the row with an all-NA row.
is_hq_ride <- tot_biketrips_clean$start_station_name %in% "HQ QR"
is_negative_ride <- !is.na(tot_biketrips_clean$ride_length) &
  tot_biketrips_clean$ride_length < 0
tot_biketrips_v2 <- tot_biketrips_clean[!(is_hq_ride | is_negative_ride), ]
rm(is_hq_ride, is_negative_ride)
min(tot_biketrips_v2$ride_length) # shortest ride left after the HQ QR / negative-duration filter
[1] 0
max(tot_biketrips_v2$ride_length) # longest ride left after the HQ QR / negative-duration filter
[1] 3356649
mean(tot_biketrips_v2$ride_length) # average ride length (total ride length / number of rides)
[1] 1326.167
median(tot_biketrips_v2$ride_length) # median ride length (middle value when rides are sorted by length)
[1] 757
# Per-column summary of the filtered data, including the derived date fields
# and ride_length
summary(tot_biketrips_v2)
ride_id rideable_type started_at ended_at start_station_name
Length:5042638 Length:5042638 Length:5042638 Length:5042638 Length:5042638
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
start_station_id end_station_name end_station_id start_lat start_lng
Length:5042638 Length:5042638 Length:5042638 Min. :41.64 Min. :-87.84
Class :character Class :character Class :character 1st Qu.:41.88 1st Qu.:-87.66
Mode :character Mode :character Mode :character Median :41.90 Median :-87.64
Mean :41.90 Mean :-87.65
3rd Qu.:41.93 3rd Qu.:-87.63
Max. :42.07 Max. :-87.52
end_lat end_lng member_casual date month
Min. :41.51 Min. :-88.07 Length:5042638 Min. :2020-10-01 Length:5042638
1st Qu.:41.88 1st Qu.:-87.66 Class :character 1st Qu.:2021-04-18 Class :character
Median :41.90 Median :-87.64 Mode :character Median :2021-06-23 Mode :character
Mean :41.90 Mean :-87.65 Mean :2021-05-29
3rd Qu.:41.93 3rd Qu.:-87.63 3rd Qu.:2021-08-12
Max. :42.17 Max. :-87.49 Max. :2021-09-30
day year day_of_week ride_length
Length:5042638 Length:5042638 Length:5042638 Min. : 0
Class :character Class :character Class :character 1st Qu.: 427
Mode :character Mode :character Mode :character Median : 757
Mean : 1326
3rd Qu.: 1369
Max. :3356649
# Compare ride length between rider types (casual vs member) ----
# The `data =` form keeps the result columns named `member_casual`,
# `day_of_week` and `ride_length` instead of `tot_biketrips_v2$...`.
aggregate(ride_length ~ member_casual, data = tot_biketrips_v2, FUN = mean)
aggregate(ride_length ~ member_casual, data = tot_biketrips_v2, FUN = median)
aggregate(ride_length ~ member_casual, data = tot_biketrips_v2, FUN = max)
aggregate(ride_length ~ member_casual, data = tot_biketrips_v2, FUN = min)

# Mean ride length for each rider type on each day of the week
aggregate(
  ride_length ~ member_casual + day_of_week,
  data = tot_biketrips_v2,
  FUN = mean
)
tot_biketrips_v2$member_casual <chr> | tot_biketrips_v2$day_of_week <chr> | tot_biketrips_v2$ride_length <dbl> | |
---|---|---|---|
casual | Friday | 1819.9010 | |
member | Friday | 823.9347 | |
casual | Monday | 1885.4267 | |
member | Monday | 803.7666 | |
casual | Saturday | 2067.1800 | |
member | Saturday | 926.4486 | |
casual | Sunday | 2213.0851 | |
member | Sunday | 943.8355 | |
casual | Thursday | 1633.4148 | |
member | Thursday | 783.7299 |
# Put the weekdays in calendar order (Sunday first) so grouped output is not
# sorted alphabetically.
# NOTE(review): values of day_of_week that do not exactly match one of these
# English names become NA -- confirm the weekday names were generated in an
# English locale.
tot_biketrips_v2$day_of_week <- ordered(
  tot_biketrips_v2$day_of_week,
  levels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  )
)

# Mean ride length for each rider type on each day of the week. The `data =`
# form keeps the result columns named `member_casual`, `day_of_week` and
# `ride_length` instead of `tot_biketrips_v2$...`.
aggregate(
  ride_length ~ member_casual + day_of_week,
  data = tot_biketrips_v2,
  FUN = mean
)
tot_biketrips_v2$member_casual <chr> | tot_biketrips_v2$day_of_week <ord> | tot_biketrips_v2$ride_length <dbl> | |
---|---|---|---|
casual | Sunday | 2213.0851 | |
member | Sunday | 943.8355 | |
casual | Monday | 1885.4267 | |
member | Monday | 803.7666 | |
casual | Tuesday | 1702.6415 | |
member | Tuesday | 786.1274 | |
casual | Wednesday | 1657.3003 | |
member | Wednesday | 790.6077 | |
casual | Thursday | 1633.4148 | |
member | Thursday | 783.7299 |
`
# Ridership by rider type and weekday: number of rides and mean ride length.
tot_biketrips_v2 %>%
  # weekday label derived from the ride start time
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length),
    # return an ungrouped result; also silences the "grouped output" message
    .groups = "drop"
  ) %>%
  arrange(member_casual, weekday)
`summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
member_casual <chr> | weekday <ord> | number_of_rides <int> | average_duration <dbl> | |
---|---|---|---|---|
casual | Sun | 438955 | 2213.0851 | |
casual | Mon | 262243 | 1885.4267 | |
casual | Tue | 247212 | 1702.6415 | |
casual | Wed | 252639 | 1657.3003 | |
casual | Thu | 269059 | 1633.4148 | |
casual | Fri | 332637 | 1819.9010 | |
casual | Sat | 514655 | 2067.1800 | |
member | Sun | 338824 | 943.8355 | |
member | Mon | 369040 | 803.7666 | |
member | Tue | 400265 | 786.1274 |
# Open the data viewer only in an interactive session; View() has no use when
# the script is run non-interactively.
if (interactive()) {
  View(tot_biketrips_v2)
}

# Number of rides per weekday, casual riders vs members, as side-by-side bars.
tot_biketrips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length),
    .groups = "drop"
  ) %>%
  arrange(member_casual, weekday) %>%
  ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge") +
  # the caption is a plot label, not an aesthetic: it must go through labs()
  labs(caption = "Data by Motivate International Inc") +
  theme_bw()
`summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
NA
NA
`summarise()` has grouped output by 'member_casual'. You can override using the `.groups` argument.
# Keep only rides made on classic or electric bikes.
type_of_bike <- filter(
  tot_biketrips_clean,
  rideable_type %in% c("classic_bike", "electric_bike")
)

# Ride totals per rider type and bike type, drawn as side-by-side bars.
type_of_bike %>%
  count(member_casual, rideable_type, name = "totals") %>%
  ggplot(aes(x = member_casual, y = totals, fill = rideable_type)) +
  geom_col(position = "dodge") +
  labs(
    title = "Type of bike usage by Rider type",
    x = "Rider type",
    y = NULL,
    fill = "Bike type"
  ) +
  scale_fill_manual(
    values = c("classic_bike" = "#7291C9", "electric_bike" = "#79CC85")
  ) +
  theme_bw() +
  theme(legend.position = "top")
# Ride totals per weekday and bike type, one panel per rider type.
type_of_bike %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, rideable_type, weekday) %>%
  summarise(totals = n(), .groups = "drop") %>%
  ggplot(aes(x = weekday, y = totals, fill = rideable_type)) +
  geom_col(position = "dodge") +
  facet_wrap(~member_casual) +
  labs(
    title = "Bike type usage by user type during a week",
    # the x axis shows the weekday; rider type is shown by the facets
    x = "Day of week",
    y = NULL,
    fill = "Bike type",
    caption = "Data by Motivate International Inc"
  ) +
  scale_fill_manual(
    values = c("classic_bike" = "#7291C9", "electric_bike" = "#79CC85")
  ) +
  theme_bw() +
  theme(legend.position = "top")
NA
NA
NA